# libraries for data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost
import openpyxl
# libraries for data manipulation and model building
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.tree import DecisionTreeRegressor # used for imputing missing values
from sklearn.model_selection import KFold,train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, PolynomialFeatures
# from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from scipy import stats
from sklearn.preprocessing import OneHotEncoder
# import warnings
# import pandas_access as mdb
# warnings.filterwarnings('ignore')
# %matplotlib inline
This project features the 2023 national data of the YouthRisk Behavior Surveillance System (YRBSS) survey conducted by the Centers for Disease Control and Prevention (CDC). It is developed in 1990 to monitor health risk behaviors among high school students in the United States and has provided insights into the leading causes of death, disability, and social problems among adolescents. Alongside with the student demographics (sex, sexual identity, race and ethnicity, and grade), a total of 107 survey questions includes youth health behaviors and conditions, substance use behaviors, and student experiences.
The goal of this initial EDA is to understand the dataset and the patterns as well as the possible co-occurence of health risk behaviors.
Apart from two free text entry variables recording the respondent's height in meters (q6) and weight in kilograms (q7) that are also used to calculate the Bosy Mass Index (BMI) in the formula: BMI = Weight/Height^2, most of the survey questions require respondents to select from categorized choices defining the extent of the described situation in the question. There are 20103 entries collected for 2023. More on data edits and the detailed code book can be found in the User Guide for 2023 YRBS.
Changes to note from the 2021 national dataset is that substance use weighted percentage has not change significantly (by 0.03%) within the 2 years, mental health related problems have been alleviated by a small scale as we can see that the rate of respondents that have seriously considered attempting suicide decreased from 22.2% to 20.4%. We will continue to investigate the changes between survey result distributions implement this project.
The national dataset does not have state nor region identifiers because the national samples are not constructed to provide representative data at state or region levels.
file_path = "XXHq_20223.xlsx"
# Read the Excel file
df = pd.read_excel(file_path, engine="openpyxl")
df.drop(columns = ['orig_rec'], inplace = True)
print(df.head())
site raceeth q6orig q7orig record q1 q2 q3 q4 q5 ... q102 \ 0 XX NaN 505 180 1 3.0 1.0 1.0 NaN C ... 2.0 1 XX 5.0 N N 233 2 4.0 2.0 1.0 2.0 E ... 2.0 2 XX 5.0 506 165 3 5.0 2.0 3.0 2.0 E ... 1.0 3 XX 5.0 N N 105 4 6.0 1.0 2.0 2.0 E ... 2.0 4 XX 5.0 601 125 5 3.0 2.0 1.0 2.0 E ... 2.0 q103 q104 q105 q106 q107 BMIPCT weight stratum psu 0 3.0 3.0 2.0 2.0 2.0 97.083855 0.8614 103 16294 1 2.0 5.0 2.0 1.0 2.0 NaN 0.8920 103 16294 2 2.0 4.0 2.0 2.0 1.0 92.262516 0.5081 103 16294 3 4.0 5.0 2.0 1.0 1.0 NaN 1.1759 103 16294 4 2.0 5.0 2.0 2.0 1.0 7.567053 0.8920 103 16294 [5 rows x 116 columns]
# Drop columns Q88 to Q107 for not being in standard survey and too much missing values which may not contribute to the project
columns_to_drop = [f"q{num}" for num in range(88, 108)] # Generates a list of q88 to q107
df = df.drop(columns=columns_to_drop, errors='ignore') # Ignore errors if column doesn't exist
# Drop columns "q6orig" and "q7orig" for we have the calculated response stored for q6 and q7, so we no longer need the original record.
# Drop not meaningful column "site" as it originally represents the geographic location of the school or district where the survey data is collected, but was masked from the national data sample.
df = df.drop(columns=["q6orig", "q7orig", "site"], errors='ignore')
print(df.head())
raceeth record q1 q2 q3 q4 q5 q6 q7 q8 ... q82 \ 0 NaN 1 3.0 1.0 1.0 NaN C 1.65 81.65 4.0 ... 2.0 1 5.0 2 4.0 2.0 1.0 2.0 E NaN NaN 5.0 ... 2.0 2 5.0 3 5.0 2.0 3.0 2.0 E 1.68 74.84 5.0 ... 2.0 3 5.0 4 6.0 1.0 2.0 2.0 E NaN NaN 4.0 ... 2.0 4 5.0 5 3.0 2.0 1.0 2.0 E 1.85 56.70 5.0 ... 2.0 q83 q84 q85 q86 q87 BMIPCT weight stratum psu 0 1.0 1.0 3.0 1.0 NaN 97.083855 0.8614 103 16294 1 2.0 3.0 5.0 1.0 4.0 NaN 0.8920 103 16294 2 1.0 2.0 1.0 1.0 3.0 92.262516 0.5081 103 16294 3 1.0 3.0 4.0 1.0 3.0 NaN 1.1759 103 16294 4 1.0 3.0 3.0 1.0 4.0 7.567053 0.8920 103 16294 [5 rows x 93 columns]
print(df.columns.tolist())
print(df.info())
['raceeth', 'record', 'q1', 'q2', 'q3', 'q4', 'q5', 'q6', 'q7', 'q8', 'q9', 'q10', 'q11', 'q12', 'q13', 'q14', 'q15', 'q16', 'q17', 'q18', 'q19', 'q20', 'q21', 'q22', 'q23', 'q24', 'q25', 'q26', 'q27', 'q28', 'q29', 'q30', 'q31', 'q32', 'q33', 'q34', 'q35', 'q36', 'q37', 'q38', 'q39', 'q40', 'q41', 'q42', 'q43', 'q44', 'q45', 'q46', 'q47', 'q48', 'q49', 'q50', 'q51', 'q52', 'q53', 'q54', 'q55', 'q56', 'q57', 'q58', 'q59', 'q60', 'q61', 'q62', 'q63', 'q64', 'q65', 'q66', 'q67', 'q68', 'q69', 'q70', 'q71', 'q72', 'q73', 'q74', 'q75', 'q76', 'q77', 'q78', 'q79', 'q80', 'q81', 'q82', 'q83', 'q84', 'q85', 'q86', 'q87', 'BMIPCT', 'weight', 'stratum', 'psu'] <class 'pandas.core.frame.DataFrame'> RangeIndex: 20103 entries, 0 to 20102 Data columns (total 93 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 raceeth 19733 non-null float64 1 record 20103 non-null int64 2 q1 20005 non-null float64 3 q2 19945 non-null float64 4 q3 19910 non-null float64 5 q4 19849 non-null float64 6 q5 18714 non-null object 7 q6 17814 non-null float64 8 q7 17814 non-null float64 9 q8 15071 non-null float64 10 q9 19598 non-null float64 11 q10 18976 non-null float64 12 q11 17865 non-null float64 13 q12 19595 non-null float64 14 q13 11374 non-null float64 15 q14 18982 non-null float64 16 q15 17974 non-null float64 17 q16 11652 non-null float64 18 q17 17460 non-null float64 19 q18 14577 non-null float64 20 q19 17302 non-null float64 21 q20 15752 non-null float64 22 q21 15599 non-null float64 23 q22 19266 non-null float64 24 q23 18286 non-null float64 25 q24 19902 non-null float64 26 q25 19898 non-null float64 27 q26 19863 non-null float64 28 q27 19667 non-null float64 29 q28 18366 non-null float64 30 q29 19336 non-null float64 31 q30 14635 non-null float64 32 q31 17442 non-null float64 33 q32 16820 non-null float64 34 q33 19739 non-null float64 35 q34 12157 non-null float64 36 q35 19804 non-null float64 37 q36 19091 non-null float64 38 q37 13925 non-null float64 39 q38 18923 non-null float64 40 q39 18942 non-null float64 41 q40 15210 non-null float64 42 q41 19248 non-null float64 43 q42 19202 non-null float64 44 q43 15441 non-null float64 45 q44 14414 non-null float64 46 q45 14305 non-null float64 47 q46 16603 non-null float64 48 q47 19239 non-null float64 49 q48 19600 non-null float64 50 q49 19527 non-null float64 51 q50 16847 non-null float64 52 q51 14226 non-null float64 53 q52 19322 non-null float64 54 q53 18981 non-null float64 55 q54 16495 non-null float64 56 q55 15097 non-null float64 57 q56 16121 non-null float64 58 q57 18573 non-null float64 59 q58 17645 non-null float64 60 q59 18339 non-null float64 61 q60 16448 non-null float64 62 q61 18312 non-null float64 63 q62 17944 non-null float64 64 q63 15381 non-null float64 65 q64 18100 non-null float64 66 q65 18301 non-null float64 67 q66 11795 non-null float64 68 q67 13936 non-null float64 69 q68 16608 non-null float64 70 q69 19027 non-null float64 71 q70 15336 non-null float64 72 q71 15245 non-null float64 73 q72 15276 non-null float64 74 q73 15212 non-null float64 75 q74 15183 non-null float64 76 q75 14268 non-null float64 77 q76 18876 non-null float64 78 q77 13876 non-null float64 79 q78 12964 non-null float64 80 q79 15509 non-null float64 81 q80 15203 non-null float64 82 q81 15519 non-null float64 83 q82 12778 non-null float64 84 q83 17924 non-null float64 85 q84 15705 non-null float64 86 q85 17441 non-null float64 87 q86 14541 non-null float64 88 q87 16535 non-null float64 89 BMIPCT 17814 non-null float64 90 weight 20103 non-null float64 91 stratum 20103 non-null int64 92 psu 20103 non-null int64 dtypes: float64(89), int64(3), object(1) memory usage: 14.3+ MB None
print(df.describe())
raceeth record q1 q2 q3 \
count 19733.000000 20103.000000 20005.000000 19945.000000 19910.000000
mean 5.010845 10140.386161 4.892677 1.504437 2.361477
std 1.832258 5863.836462 1.218455 0.499993 1.101724
min 1.000000 1.000000 1.000000 1.000000 1.000000
25% 5.000000 5062.500000 4.000000 1.000000 1.000000
50% 5.000000 10129.000000 5.000000 2.000000 2.000000
75% 6.000000 15203.500000 6.000000 2.000000 3.000000
max 8.000000 20386.000000 7.000000 2.000000 5.000000
q4 q6 q7 q8 q9 \
count 19849.000000 17814.000000 17814.000000 15071.000000 19598.000000
mean 1.798630 1.696357 68.680140 4.310796 1.324625
std 0.401034 0.102717 18.466386 0.975215 0.899069
min 1.000000 1.320000 31.300000 1.000000 1.000000
25% 2.000000 1.630000 55.790000 4.000000 1.000000
50% 2.000000 1.700000 64.410000 5.000000 1.000000
75% 2.000000 1.780000 77.110000 5.000000 1.000000
max 2.000000 2.030000 180.990000 5.000000 5.000000
... q82 q83 q84 q85 \
count ... 12778.000000 17924.00000 15705.000000 17441.000000
mean ... 2.005869 1.57755 2.825215 3.472679
std ... 0.346559 1.18036 1.217580 1.376641
min ... 1.000000 1.00000 1.000000 1.000000
25% ... 2.000000 1.00000 2.000000 3.000000
50% ... 2.000000 1.00000 3.000000 4.000000
75% ... 2.000000 2.00000 4.000000 4.000000
max ... 3.000000 5.00000 5.000000 7.000000
q86 q87 BMIPCT weight stratum \
count 14541.000000 16535.000000 1.781400e+04 20103.000000 20103.000000
mean 1.146001 2.206229 6.337997e+01 1.000000 228.370243
std 0.803579 1.491575 2.931229e+01 1.270645 73.287806
min 1.000000 1.000000 1.597498e-08 0.002300 101.000000
25% 1.000000 1.000000 4.149272e+01 0.153200 201.000000
50% 1.000000 2.000000 6.887086e+01 0.414300 212.000000
75% 1.000000 3.000000 9.017639e+01 1.359400 300.000000
max 7.000000 7.000000 9.996278e+01 22.543700 300.000000
psu
count 20103.00000
mean 414571.68925
std 235438.54837
min 16294.00000
25% 214944.00000
50% 401089.00000
75% 573255.00000
max 788430.00000
[8 rows x 92 columns]
# missing values
missing_counts_df = df.isnull().sum()
missing_percent_df = df.isnull().mean()
missing_df = pd.DataFrame({'missing_counts': missing_counts_df,
'missing_percent': missing_percent_df}).sort_values(
by='missing_percent', ascending=False)
missing_df[missing_df['missing_percent'] > 0.3]
| missing_counts | missing_percent | |
|---|---|---|
| q13 | 8729 | 0.434214 |
| q16 | 8451 | 0.420385 |
| q66 | 8308 | 0.413272 |
| q34 | 7946 | 0.395264 |
| q82 | 7325 | 0.364373 |
| q78 | 7139 | 0.355121 |
| q77 | 6227 | 0.309755 |
| q37 | 6178 | 0.307317 |
| q67 | 6167 | 0.306770 |
Questions with the highest missing percentages (>30%) are:
Q13 - During the past 12 months, on how many days did you carry a gun?
Q16 - During the past 12 months, how many times were you in a physical fight?
Q66 - How do you describe your weight?
Q34 - During the past 30 days, on the days you smoked, how many cigarettes did you smoke per day?
Q82 - During the past 12 months, have you been tested for a sexuality transmitted disease (STD) other than HIV, such as chlamydia or gonorrhea?
Q78 - During the past 12 months, on how many sports teams did you play?
Q77 - In an average week when you are in school on how many days do you go to physical education (PE) classes?
Q37 - During the past 30 days, how did you usually get your electronic vapor?
Q67 - Which of the following are you trying to do about your weight?
q4 1 q35 1 q31 1 q26 1 q24 1 q25 1 q19 1 q18 1 q56 1 dtype: int64
# unique value counts
cols_to_drop = df.nunique().sort_values()[df.nunique().sort_values() == 1]
cols_to_drop
Series([], dtype: int64)
# Define response mappings
response_mappings = {
"q1": {1: 12, 2: 13, 3: 14, 4: 15, 5: 16, 6: 17, 7: 18},
"q2": {1: 0, 2: 1},
"q3": {1: 9, 2: 10, 3: 11, 4: 12, 5: 13}, ## E.Ungraded or other grade is represented by 13
"q4": {1: 1, 2: 0},
"q8": {1: 0, 2: 1, 3: 2, 4: 3, 5: 4}, ## seat belt use, starts from 0=never, to 4=always
"q9": {1: 0, 2: 1, 3: 2.5, 4: 4.5, 5: 6},
"q10": {1: 0, 2: 1, 3: 2, 4: 2, 5: 2, 6: 2}, # binned, 0=did not drive, 1=not drunk driving, 2=drunk drove at least once
"q11": {1: 0, 2: 1, 3: 2, 4: 2, 5: 2, 6: 2, 7: 3, 8: 3}, # binned, 0=did not drive, 1=not text driving, 2=texted 1-20 days, 3=over 20 days
"q12": {1: 0, 2: 1, 3: 2.5, 4: 4.5, 5: 6},
"q13": {1: 0, 2: 1, 3: 2.5, 4: 4.5, 5: 6},
"q14": {1: 0, 2: 1, 3: 2.5, 4: 4.5, 5: 6},
"q15": {1: 0, 2: 1, 3: 2.5, 4: 4.5, 5: 6.5, 6: 8.5, 7: 10.5, 8: 12},
"q16": {1: 0, 2: 1, 3: 2.5, 4: 4.5, 5: 6.5, 6: 8.5, 7: 10.5, 8: 12},
"q17": {1: 0, 2: 1, 3: 2.5, 4: 4.5, 5: 6.5, 6: 8.5, 7: 10.5, 8: 12},
"q18": {1: 1, 2: 0},
"q19": {1: 1, 2: 0},
"q20": {1: 0, 2: 1, 3: 2.5, 4: 4.5, 5: 6},
"q21": {1: 0, 2: 1, 3: 2, 4: 2, 5: 2, 6: 2}, # binned, 0=did not date, 1=0 time, 2=at least once
"q22": {1: 0, 2: 1, 3: 2, 4: 2, 5: 2, 6: 2}, # binned, 0=did not date, 1=0 time, 2=at least once
"q23": {1: 0, 2: 1, 3: 2, 4: 3, 5: 4},
"q24": {1: 1, 2: 0},
"q25": {1: 1, 2: 0},
"q26": {1: 1, 2: 0},
"q27": {1: 1, 2: 0},
"q28": {1: 1, 2: 0},
"q29": {1: 0, 2: 1, 3: 2.5, 4: 4.5, 5: 6},
"q30": {1: 2, 2: 1, 3: 0}, # ever injured in suicide, 2=did not attempt, 1=yes, 0=no
"q31": {1: 1, 2: 0}, # Ever smoked a cigarette (Yes=1, No=0)
"q32": {1: 0, 2: 8, 3: 9.5, 4: 11.5, 5: 13.5, 6: 15.5, 7: 17}, # 0=never smoked, Age of first cigarette use (average age for range)
"q33": {1: 0, 2: 1.5, 3: 3.5, 4: 7.5, 5: 14.5, 6: 24.5, 7: 30}, # Days smoked cigarettes in the past 30 days
"q34": {1: 0, 2: 0.5, 3: 1, 4: 3.5, 5: 8, 6: 15.5, 7: 20},
"q35": {1: 1, 2: 0}, # Ever used electronic vapor products (Yes=1, No=0)
"q36": {1: 0, 2: 1.5, 3: 3.5, 4: 7.5, 5: 14.5, 6: 24.5, 7: 30}, # Days used vapor products in the past 30 days
"q37": {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7}, # Categorical, Source of vapor products (categorical mapping)
"q38": {1: 0, 2: 1.5, 3: 3.5, 4: 7.5, 5: 14.5, 6: 24.5, 7: 30}, # Days used smokeless tobacco in past 30 days
"q39": {1: 0, 2: 1.5, 3: 4, 4: 7.5, 5: 14.5, 6: 24.5, 7: 30}, # Days smoked cigars in past 30 days
"q40": {1: 0, 2: 1, 3: 2}, # consider as binned/categorized, Attempted to quit tobacco (Yes=1, No=2, Did not use=0)
"q41": {1: 0, 2: 8, 3: 9.5, 4: 11.5, 5: 13.5, 6: 15.5, 7: 17}, # 0=never had a drink, Age of first alcohol use
"q42": {1: 0, 2: 1.5, 3: 4, 4: 7.5, 5: 14.5, 6: 24.5, 7: 30}, # Days drank alcohol in past 30 days
"q43": {1: 0, 2: 1, 3: 2, 4: 4, 5: 7.5, 6: 14.5, 7: 20}, # Days engaged in binge drinking
"q44": {1: 0, 2: 1.5, 3: 3, 4: 4, 5: 5, 6: 6.5, 7: 8.5, 8: 10}, # Largest number of drinks consumed
"q45": {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7}, # Categorical, Source of alcohol
"q46": {1: 0, 2: 1.5, 3: 6, 4: 14.5, 5: 29.5, 6: 69.5, 7: 100}, # times used marijuana
"q47": {1: 0, 2: 8, 3: 9.5, 4: 11.5, 5: 13.5, 6: 15.5, 7: 17}, # 0=never used, Age of first marijuana use
"q48": {1: 0, 2: 1.5, 3: 6, 4: 14.5, 5: 29.5, 6: 40}, # times used marijuana in past 30 days
"q49": {1: 0, 2: 1.5, 3: 6, 4: 14.5, 5: 29.5, 6: 40}, # Ever used prescription pain medicine without prescription
"q50": {1: 0, 2: 1.5, 3: 6, 4: 14.5, 5: 29.5, 6: 40}, # Ever used cocaine
"q51": {1: 0, 2: 1.5, 3: 6, 4: 14.5, 5: 29.5, 6: 40}, # Ever used inhalants
"q52": {1: 0, 2: 1.5, 3: 6, 4: 14.5, 5: 29.5, 6: 40}, # Ever used heroin
"q53": {1: 0, 2: 1.5, 3: 6, 4: 14.5, 5: 29.5, 6: 40}, # Ever used methamphetamines
"q54": {1: 0, 2: 1.5, 3: 6, 4: 14.5, 5: 29.5, 6: 40}, # Ever used ecstasy
"q55": {1: 0, 2: 1, 3: 2}, # Ever injected any illegal drug
"q56": {1: 1, 2: 0}, # Ever had sexual intercourse
"q57": {1: 0, 2: 11, 3: 12, 4: 13, 5: 14, 6: 15, 7: 16, 8: 17}, # 0=never had sex, Age at first sex
"q58": {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6}, # Number of lifetime sexual partners
"q59": {1: 0, 2: 0.5, 3: 1, 4: 2, 5: 3, 6: 4, 7: 5, 8: 6}, # Number of sexual partners in past 3 months
"q60": {1: 2, 2: 1, 3: 0}, # Categorical. Drank alcohol/used drugs before last sex (Yes=1, No=0, never had sex=2)
"q61": {1: 2, 2: 1, 3: 0}, # Categorical. Used a condom last time had sex (Yes=1, No=0), never had sex = 2
"q62": {1: 7, 2: 0, 3: 1, 4: 2, 5: 3, 6: 4, 7: 5, 8: 6}, # Categorical. Birth control method used, never had sex = 7
"q63": {1: 3, 2: 0, 3: 1, 4: 2}, # Categorical. Sex of sexual contacts (never had sex=3, female=0, male=1, both=2)
"q64": {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: pd.NA}, # Categorical. Sexual identity, pd.NA for don't know what the question is asking (equals to missing value)
"q65": {1: 0, 2: 1, 3: 2, 4: pd.NA}, # Categorical. Transgender identitym pd.NA for response 4 not knowing what the question is asking
"q66": {1: 0, 2: 1, 3: 2, 4: 3, 5: 4}, # Categorical. Perception of weight (0 = very underweight, 4 = very overweight)
"q67": {1: 1, 2: 3, 3: 2, 4: 0}, # Categorical. Weight control behavior (1 = trying to lose, 2= stay the same, 3 = trying to gain, 0 = nothing)
"q68": {1: 0, 2: 2, 3: 5, 4: 7, 5: 14, 6: 21, 7:28}, # Times drank fruit juice in past 7 days
"q69": {1: 0, 2: 2, 3: 5, 4: 7, 5: 14, 6: 21, 7:28}, # Times ate fruit in past 7 days
"q70": {1: 0, 2: 2, 3: 5, 4: 7, 5: 14, 6: 21, 7:28}, # Times ate green salad in past 7 days
"q71": {1: 0, 2: 2, 3: 5, 4: 7, 5: 14, 6: 21, 7:28}, # Times ate potatoes in past 7 days
"q72": {1: 0, 2: 2, 3: 5, 4: 7, 5: 14, 6: 21, 7:28}, # Times ate carrots in past 7 days
"q73": {1: 0, 2: 2, 3: 5, 4: 7, 5: 14, 6: 21, 7:28}, # Times ate other vegetables in past 7 days
"q74": {1: 0, 2: 2, 3: 5, 4: 7, 5: 14, 6: 21, 7:28}, # Times drank soda in past 7 days
"q75": {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7}, # Days ate breakfast in past 7 days
"q76": {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7}, # Days physically active for 60 minutes
"q77": {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5}, # Days attended PE class in an average week
"q78": {1: 0, 2: 1, 3: 2, 4: 3}, # Number of sports teams played on in past 12 months
"q79": {1: 0, 2: 1, 3: 2, 4: 3, 5: 4}, # Times had a concussion from sports in past 12 months
"q80": {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7}, # Categocial. Frequency level of social media use
"q81": {1: 1, 2: 0, 3: 2}, # Categorical. Ever tested for HIV (Yes=1, No=0, Not sure=2)
"q82": {1: 1, 2: 0, 3: 2}, # Categorical. Tested for STD in past 12 months (Yes=1, No=0, Not sure=2)
"q83": {1: 0, 2: 1, 3: 2, 4: 3, 5: pd.NA}, # Last dental visit (0=past year, 3=never, pd.NA for not sure)
"q84": {1: 0, 2: 1, 3: 2, 4: 3, 5: 4}, # Frequency level of poor mental health in past 30 days
"q85": {1: 4, 2: 5, 3: 6, 4: 7, 5: 8, 6: 9, 7: 10}, # Average sleep hours on school night
"q86": {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6}, # Categorical. Usual place of sleep
"q87": {1: 4, 2: 3, 3: 2, 4: 1, 5: 0, 6: 5, 7: pd.NA} # Grades in school (4=A, 3=B, ..., 0=F, 5=none of the grades, pd.NA for notsure)
}
# Apply transformations to Q1 to Q87
for col in df.columns:
if col.startswith("q") and col[1:].isdigit(): # Ensure it's a question column
q_number = int(col[1:])
if q_number <= 87 and col in response_mappings: # Only process Q1 to Q87
df[col] = df[col].map(response_mappings[col])
# Save the transformed dataset
transformed_file_path = "Transformed_XXHq_20223.xlsx"
df.to_excel(transformed_file_path, index=False)
print(f"Transformed dataset saved as: {transformed_file_path}")
Transformed dataset saved as: Transformed_XXHq_20223.xlsx
# unique value counts
cols_to_drop = df.nunique().sort_values()[df.nunique().sort_values() == 1]
cols_to_drop
Series([], dtype: int64)
Consider the following questions as categorical, as either their first response option is not related to the rest or does not form a meaningful level.
"q10", "q11", "q21", "q22", "q30", "q32", "q37", "q40", "q41", "q45", "q57", "q59", "q60", "q61", "q62", "q63", "q64", "q65", "q66", "q67", "q80", "q81", "q82", "q86"
Consider using the QN dataset for a probit model. Remember to check for normalization assumption.
df = pd.read_excel(transformed_file_path, engine="openpyxl")
# Cramer's V statistic for catagorical association.
excluded_columns = ["q5", "BMIPCT", "q6", "q7", "weight", "stratum", "psu", "record"]
categorical_df = df.drop(columns=excluded_columns, errors='ignore')
# Compute Cramér's V
def cramers_v(x, y):
"""Calculate Cramér's V statistic for categorical-categorical association."""
confusion_matrix = pd.crosstab(x, y)
chi2 = stats.chi2_contingency(confusion_matrix)[0]
n = confusion_matrix.sum().sum()
r, k = confusion_matrix.shape
return np.sqrt(chi2 / (n * (min(r, k) - 1))) if n > 0 else np.nan
# For all categorical variables
def cramers_v_matrix(dataframe):
"""Computes Cramér's V for all pairs of categorical variables in a DataFrame."""
cols = dataframe.columns
n = len(cols)
cramers_v_df = pd.DataFrame(np.zeros((n, n)), index=cols, columns=cols)
for i in range(n):
for j in range(i, n):
if i == j:
cramers_v_df.iloc[i, j] = 1.0 # Perfect correlation with itself
else:
try:
v = cramers_v(dataframe[cols[i]], dataframe[cols[j]])
cramers_v_df.iloc[i, j] = v
cramers_v_df.iloc[j, i] = v # Symmetric matrix
except Exception:
cramers_v_df.iloc[i, j] = np.nan
cramers_v_df.iloc[j, i] = np.nan
return cramers_v_df
# Compute Cramér's V matrix
cramers_v_results = cramers_v_matrix(categorical_df)
print(cramers_v_results)
# plot heatmap for the Cramer's V statistic
plt.figure(figsize=(20, 20))
sns.heatmap(cramers_v_results, annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Cramer's V statistic for categorical variables")
plt.show()
raceeth q1 q2 q3 q4 q8 q9 \
raceeth 1.000000 0.044846 0.028691 0.041114 1.000000 0.072129 0.043661
q1 0.044846 1.000000 0.047639 0.604506 0.048649 0.091398 0.069488
q2 0.028691 0.047639 1.000000 0.025431 0.014294 0.057296 0.044664
q3 0.041114 0.604506 0.025431 1.000000 0.037300 0.080089 0.051610
q4 1.000000 0.048649 0.014294 0.037300 1.000000 0.040144 0.057473
... ... ... ... ... ... ... ...
q83 0.065789 0.083819 0.034040 0.070505 0.041340 0.075665 0.057041
q84 0.047925 0.037839 0.320959 0.030393 0.025802 0.054634 0.053271
q85 0.045667 0.052175 0.059903 0.057220 0.014175 0.096911 0.060306
q86 0.042926 0.119990 0.031433 0.061834 0.047074 0.113816 0.085788
q87 0.100342 0.041993 0.139030 0.047755 0.147813 0.111825 0.054568
q10 q11 q12 ... q78 q79 q80 \
raceeth 0.127186 0.103859 0.034408 ... 0.068120 0.041916 0.040620
q1 0.295016 0.251001 0.064292 ... 0.051153 0.049902 0.043308
q2 0.065098 0.060452 0.077037 ... 0.074276 0.060687 0.136509
q3 0.264071 0.233007 0.043071 ... 0.050002 0.043780 0.045682
q4 0.079977 0.075568 0.020578 ... 0.075611 0.036500 0.044324
... ... ... ... ... ... ... ...
q83 0.082599 0.035261 0.057427 ... 0.058835 0.075878 0.052961
q84 0.061932 0.036719 0.043507 ... 0.071981 0.040012 0.097822
q85 0.077413 0.046771 0.046831 ... 0.087207 0.054950 0.061799
q86 0.156038 0.062620 0.096786 ... 0.031774 0.127250 0.091787
q87 0.089512 0.048363 0.060165 ... 0.094252 0.060909 0.054917
q81 q82 q83 q84 q85 q86 q87
raceeth 0.059174 0.056257 0.065789 0.047925 0.045667 0.042926 0.100342
q1 0.102338 0.083515 0.083819 0.037839 0.052175 0.119990 0.041993
q2 0.030828 0.069393 0.034040 0.320959 0.059903 0.031433 0.139030
q3 0.081917 0.064238 0.070505 0.030393 0.057220 0.061834 0.047755
q4 0.035587 0.018089 0.041340 0.025802 0.014175 0.047074 0.147813
... ... ... ... ... ... ... ...
q83 0.022376 0.027702 1.000000 0.069054 0.084704 0.146813 0.093364
q84 0.042790 0.044632 0.069054 1.000000 0.151414 0.054875 0.076286
q85 0.067809 0.065841 0.084704 0.151414 1.000000 0.062269 0.104456
q86 0.093209 0.097662 0.146813 0.054875 0.062269 1.000000 0.062734
q87 0.048236 0.046393 0.093364 0.076286 0.104456 0.062734 1.000000
[85 rows x 85 columns]
# Convert Cramér's V matrix to a sorted list of variable pairs
cramers_v_pairs = (
cramers_v_results
.stack()
.reset_index()
.rename(columns={'level_0': 'Variable 1', 'level_1': 'Variable 2', 0: 'Cramers V'})
)
# Remove duplicate pairs (keep only upper triangle of matrix)
cramers_v_pairs = cramers_v_pairs[cramers_v_pairs["Variable 1"] != cramers_v_pairs["Variable 2"]]
# Sort by Cramér's V in descending order
cramers_v_pairs = cramers_v_pairs.sort_values(by="Cramers V", ascending=False)
# Display sorted associations
print(cramers_v_pairs.head(30))
Variable 1 Variable 2 Cramers V 4813 q59 q56 1.000000 4728 q58 q56 1.000000 4898 q60 q56 1.000000 4983 q61 q56 1.000000 4561 q56 q59 1.000000 4560 q56 q58 1.000000 4563 q56 q61 1.000000 4562 q56 q60 1.000000 2409 q31 q32 1.000000 2493 q32 q31 1.000000 340 q4 raceeth 1.000000 4 raceeth q4 1.000000 4643 q57 q56 1.000000 4559 q56 q57 1.000000 5068 q62 q56 0.947679 4564 q56 q62 0.947679 4989 q61 q62 0.823033 5073 q62 q61 0.823033 5153 q63 q56 0.756632 4565 q56 q63 0.756632 687 q11 q10 0.735762 603 q10 q11 0.735762 4732 q58 q60 0.731473 4900 q60 q58 0.731473 4901 q60 q59 0.727589 4817 q59 q60 0.727589 2321 q30 q29 0.721190 2237 q29 q30 0.721190 4648 q57 q61 0.717324 4984 q61 q57 0.717324
# Convert Cramer's V matrix to a sorted list of variable pairs
cramers_v_pairs = (
cramers_v_results
.stack()
.reset_index()
.rename(columns={'level_0': 'Variable 1', 'level_1': 'Variable 2', 0: 'Cramers V'})
)
cramers_v_pairs["Pair"] = cramers_v_pairs.apply(lambda x: tuple(sorted([x["Variable 1"], x["Variable 2"]])), axis=1)
# Drop duplicate pairs (keep only unique combinations)
cramers_v_pairs = cramers_v_pairs.drop_duplicates(subset=["Pair"]).drop(columns=["Pair"])
# Remove self-correlations
cramers_v_pairs = cramers_v_pairs[cramers_v_pairs["Variable 1"] != cramers_v_pairs["Variable 2"]]
# Sort in descending order of Cramer's V
cramers_v_pairs = cramers_v_pairs.sort_values(by="Cramers V", ascending=False)
# Print the top 30 unique pairs
print(cramers_v_pairs.head(25))
Variable 1 Variable 2 Cramers V 4 raceeth q4 1.000000 4560 q56 q58 1.000000 4561 q56 q59 1.000000 4562 q56 q60 1.000000 4563 q56 q61 1.000000 2409 q31 q32 1.000000 4559 q56 q57 1.000000 4564 q56 q62 0.947679 4989 q61 q62 0.823033 4565 q56 q63 0.756632 603 q10 q11 0.735762 4732 q58 q60 0.731473 4817 q59 q60 0.727589 2237 q29 q30 0.721190 4648 q57 q61 0.717324 4647 q57 q60 0.715309 4903 q60 q61 0.714463 4733 q58 q61 0.713312 4818 q59 q61 0.710113 4904 q60 q62 0.677560 2754 q35 q37 0.667169 2753 q35 q36 0.664059 2757 q35 q40 0.658417 2065 q27 q28 0.644388 2763 q35 q46 0.637556
Highly correlated questions as above.
Q61 Q62 0.823 Condom use during last sex vs. Birth control method used last time
Q10 Q11 0.736 Riding with a drinking driver vs. Driving after drinking alcohol
Q58 Q60 0.731 Number of sex partners vs. Alcohol/drug use before last sex
Q59 Q60 0.728 Current sexual activity vs. Alcohol/drug use before last sex
Q29 Q30 0.721 Suicide attempt vs. Suicide attempt requiring medical attention
Q57 Q61 0.717 Age of first sexual intercourse vs. Condom use last time
Q57 Q60 0.715 Age of first sex vs. Alcohol/drug use before last sex
Q60 Q61 0.714 Alcohol/drug use before last sex vs. Condom use last time
Q58 Q61 0.713 Number of sex partners vs. Condom use last time
Q59 Q61 0.710 Current sexual activity vs. Condom use last time
Q60 Q62 0.678 Alcohol/drug use before last sex vs. Birth control method used
Q1 Q3 0.605 Age vs. Grade level
Q2 Q63 0.567 Sex vs. Sex of sexual contacts
Q52 Q55 0.560 Ever used cocaine vs. Ever injected an illegal drug
Q60 Q63 0.540 Alcohol/drug use before last sex vs. Sex of sexual contacts
Q42 Q43 0.539 Current alcohol use vs. Binge drinking
Q36 Q40 0.539 Ever electronic vapor product use vs. Tried quitting all tobacco products
Q33 Q34 0.537 Current cigarette use vs. Heavy cigarette smoking (>10 per day)
Q61 Q63 0.536 Condom use vs. Sex of sexual contacts
Q37 Q40 0.533 Current electronic vapor product use vs. Tried quitting all tobacco products
Q54 Q55 0.516 Ever used ecstasy vs. Ever injected an illegal drug
Q53 Q55 0.514 Ever used methamphetamine vs. Ever injected an illegal drug
Q81 Q82 0.513 Ever tested for HIV vs. Ever tested for STDs
Q20 Q21 0.511 Forced sexual intercourse vs. Dating sexual violence
Q58 Q59 0.492 Number of sex partners vs. Current sexual activity\
Strong association is seen among sexual behaviors and risky practices. Q57 (Age at first sex), Q58 (Number of sex partners), Q59 (Current sexual activity), Q60 (Alcohol/drug use before last sex), Q61 (Condom use), and Q62 (Birth control method used) are highly interrelated. One possible explanation is that individuals with earlier sexual debut (Q57) tend to have more partners (Q58) and are more likely to engage in risky behaviors like substance use before sex (Q60) and not using protection (Q61, Q62). To account for these correlations, we are combining several question reponses to new binary indicators of risky sexual behaviors. This will also help reduce the dimensionality.
q34, q40, q43 and q44 is dropped because of largely missing values and not meaningful implications.
# A few extra feature engineering to account for collinearity.
# Comment out the ones you wish not to create
df["q10_11_unsafe_driving"] = ((df["q10"] > 1) | (df["q11"] > 1)).astype(int)
df["q27_30_suicide_combined"] = ((df["q30"]) != 0 | (df["q29"] != 0) | (df["q28"] != 0) | (df["q27"] != 0)).astype(int)
df["q33_smoking"] = (df["q33"] != 0).astype(int)
df["q48_55_substance_use"] = (df.loc[:, "q48":"q55"] != 0).any(axis=1).astype(int) # q48 to 55 combined as substance_use indicator
df["q57_early_sexual_intercourse"] = (df["q57"] != 0).astype(int) # first sexual intercourse age question transoformed to binary variable maerked as 1 if q57 response is not 0 (never had sex)
df["q60_62_unsafe_sex"] = ((df["q60"] == 0) | (df["q61"] == 0) | (df["q62"].isin([0, 5]))).astype(int) # as long as one of the related question matches, this variable will be 1
df["q68_73_healthy_diet"] = (df.loc[:, "q68":"q73"].ne(0).sum(axis=1) >= 3).astype(int) # healthy_diet = 1 if at least 3 of q68-73 are not 0
df["q81_82_std_checked"] = ((df["q81"] == 1) | (df["q82"] == 1)).astype(int)
df["suicide_attempt"] = (df["q29"] != 0).astype(int)
## storing possible target variable, choose as you wish. Raw questions are not one hot encoded yet.
q27_raw = df["q27"]
q28_raw = df["q28"]
q29_raw = df["q29"]
q30_raw = df["q30"]
suicide_q29_attempt = df["suicide_attempt"]
suicide_combined_q = df["q27_30_suicide_combined"]
df = df.drop(columns=["q10", "q11", "q27", "q28", "q29", "q30", "q33", "q48", "q49", "q50", "q51", "q52", "q53", "q54",
"q55", "q60", "q61", "q62", "q68", "q69", "q70", "q71", "q72", "q73", "q81", "q82", "suicide_combined", "psu", "stratum", "record"],
errors='ignore'
)
# renaming for clarity, add more if you would like to.
df = df.rename(columns={'q1': 'q1_age',
'q2': 'q2_gender',
'q3': 'q3_grade',
'q4': 'q4_ethnicity',
'q5': 'q5_race',
'q6': 'q6_height',
'q7': 'q7_weight',
'q8': 'q8_seatbelt_riding',
'q9': 'q9_ride_w_drunk_drinver',
'q12': 'q12_days_carrying_weapon_atschool',
'q13': 'q13_days_carrying_gun',
'q14': 'q14_skipped_school_bc_insecure',
'q15': 'q15_been_threatened_at_school',
'q16': 'q16_12mo_physical_fight',
'q17': 'q17_12mo_physicalf_school',
'q18': 'q18_seen_vio_neighborhood',
'q19': 'q19_forced_sex',
'q20': 'q20_forced_sexual',
'q21': 'q21_dating_unwilling_sexual',
'q22': 'q22_dating_physical_hurting',
'q23': 'q23_discriminated',
'q24': 'q24_been_bullied',
'q25': 'q25_electronically_bullied',
'q26': 'q26_depression',
'q31': 'q31_ever_smoked',
'q32': 'q32_first_smoke_age',
'q34': 'q34_smoke_count_daily',
'q35': 'q35_ever_vaped',
'q36': 'q36_days_vaped_30d',
'q37': 'q37_vapor_source',
'q38': 'q38_days_chewed_30d',
'q39': 'q39_days_smoked_30d',
'q40': 'q40_quit_tabacco_12mo',
'q41': 'q41_first_drink_age',
'q42': 'q42_days_drank_30d',
'q43': 'q43_days_binge_drink_30d',
'q44': 'q44_max_drinks_30d',
'q45': 'q45_alcohol_source',
'q46': 'q46_times_marijuana',
'q47': 'q47_first_marijuana_age',
'q56': 'q56_early_sex',
'q57': 'q57_age_first_sex',
'q58': 'q58_sexual_partners_count',
'q59': 'q59_sex_count_3mo',
'q63': 'q63_sex_contact_gender',
'q64': 'q64_sexual_identity',
'q65': 'q65_transgender',
'q66': 'q66_weight_perception',
'q67': 'q67_weight_plans',
'q74': 'q74_soda_count',
'q75': 'q75_days_breakfast_1w',
'q76': 'q76_days_physically_active_1w',
'q77': 'q77_days_PE_class_1w',
'q78': 'q78_sports_team_count_12mo',
'q79': 'q79_concussion_count_12mo',
'q80': 'q80_social_media_freq',
'q83': 'q83_last_dentist',
'q84': 'q84_mental_health_not_good',
'q85': 'q85_workdays_average_sleep',
'q86': 'q86_unstable_housing',
'q87': 'q87_academic_grades'
}
)
df = df.drop(columns=['q4_ethnicity', 'q5_race'],
errors='ignore'
)
print(df.head(5))
raceeth q1_age q2_gender q3_grade q6_height q7_weight \ 0 NaN 14.0 0.0 9.0 1.65 81.65 1 5.0 15.0 1.0 9.0 NaN NaN 2 5.0 16.0 1.0 11.0 1.68 74.84 3 5.0 17.0 0.0 10.0 NaN NaN 4 5.0 14.0 1.0 9.0 1.85 56.70 q8_seatbelt_riding q9_ride_w_drunk_drinver \ 0 3.0 4.5 1 4.0 0.0 2 4.0 2.5 3 3.0 0.0 4 4.0 0.0 q12_days_carrying_weapon_atschool q13_days_carrying_gun ... weight \ 0 0.0 0.0 ... 0.8614 1 0.0 0.0 ... 0.8920 2 0.0 0.0 ... 0.5081 3 0.0 0.0 ... 1.1759 4 0.0 0.0 ... 0.8920 q10_11_unsafe_driving q27_30_suicide_combined q33_smoking \ 0 0 1 0 1 0 1 0 2 1 1 0 3 1 1 0 4 0 1 0 q48_55_substance_use q57_early_sexual_intercourse q60_62_unsafe_sex \ 0 1 0 0 1 0 0 0 2 1 1 1 3 1 1 1 4 0 1 1 q68_73_healthy_diet q81_82_std_checked suicide_attempt 0 0 0 0 1 0 0 0 2 1 0 0 3 0 0 0 4 1 0 0 [5 rows x 71 columns]
STOP HERE¶
Cramer's V after rename, feature creation and combine.¶
# Cramer's V statistic for catagorical association.
excluded_columns = ["BMIPCT", "q6_height", "q7_weight", "weight"]
categorical_df = df.drop(columns=excluded_columns, errors='ignore')
# Compute Cramér's V
def cramers_v(x, y):
"""Calculate Cramér's V statistic for categorical-categorical association."""
confusion_matrix = pd.crosstab(x, y)
chi2 = stats.chi2_contingency(confusion_matrix)[0]
n = confusion_matrix.sum().sum()
r, k = confusion_matrix.shape
return np.sqrt(chi2 / (n * (min(r, k) - 1))) if n > 0 else np.nan
# For all categorical variables
def cramers_v_matrix(dataframe):
"""Computes Cramér's V for all pairs of categorical variables in a DataFrame."""
cols = dataframe.columns
n = len(cols)
cramers_v_df = pd.DataFrame(np.zeros((n, n)), index=cols, columns=cols)
for i in range(n):
for j in range(i, n):
if i == j:
cramers_v_df.iloc[i, j] = 1.0 # Perfect correlation with itself
else:
try:
v = cramers_v(dataframe[cols[i]], dataframe[cols[j]])
cramers_v_df.iloc[i, j] = v
cramers_v_df.iloc[j, i] = v # Symmetric matrix
except Exception:
cramers_v_df.iloc[i, j] = np.nan
cramers_v_df.iloc[j, i] = np.nan
return cramers_v_df
# Compute Cramér's V matrix
cramers_v_results = cramers_v_matrix(categorical_df)
# Convert Cramer's V matrix to a sorted list of variable pairs
cramers_v_pairs = (
cramers_v_results
.stack()
.reset_index()
.rename(columns={'level_0': 'Variable 1', 'level_1': 'Variable 2', 0: 'Cramers V'})
)
cramers_v_pairs["Pair"] = cramers_v_pairs.apply(lambda x: tuple(sorted([x["Variable 1"], x["Variable 2"]])), axis=1)
# Drop duplicate pairs (keep only unique combinations)
cramers_v_pairs = cramers_v_pairs.drop_duplicates(subset=["Pair"]).drop(columns=["Pair"])
# Remove self-correlations
cramers_v_pairs = cramers_v_pairs[cramers_v_pairs["Variable 1"] != cramers_v_pairs["Variable 2"]]
# Sort in descending order
cramers_v_pairs = cramers_v_pairs.sort_values(by="Cramers V", ascending=False)
print(cramers_v_pairs.head(25))
Variable 1 Variable 2 Cramers V 2518 q56_early_sex q58_sexual_partners_count 1.000000 2519 q56_early_sex q59_sex_count_3mo 1.000000 1429 q31_ever_smoked q32_first_smoke_age 1.000000 2608 q57_age_first_sex q57_early_sexual_intercourse 1.000000 2517 q56_early_sex q57_age_first_sex 1.000000 2675 q58_sexual_partners_count q57_early_sexual_intercourse 0.996213 1601 q34_smoke_count_daily q33_smoking 0.994110 2742 q59_sex_count_3mo q57_early_sexual_intercourse 0.983372 2541 q56_early_sex q57_early_sexual_intercourse 0.979905 2542 q56_early_sex q60_62_unsafe_sex 0.944343 2743 q59_sex_count_3mo q60_62_unsafe_sex 0.916442 2609 q57_age_first_sex q60_62_unsafe_sex 0.913534 2676 q58_sexual_partners_count q60_62_unsafe_sex 0.913521 4217 q57_early_sexual_intercourse q60_62_unsafe_sex 0.786396 2520 q56_early_sex q63_sex_contact_gender 0.756632 2809 q63_sex_contact_gender q57_early_sexual_intercourse 0.756523 2810 q63_sex_contact_gender q60_62_unsafe_sex 0.670078 1634 q35_ever_vaped q37_vapor_source 0.667169 1633 q35_ever_vaped q36_days_vaped_30d 0.664059 1637 q35_ever_vaped q40_quit_tabacco_12mo 0.658417 1643 q35_ever_vaped q46_times_marijuana 0.637556 1644 q35_ever_vaped q47_first_marijuana_age 0.636909 70 q1_age q3_grade 0.604506 175 q2_gender q63_sex_contact_gender 0.567337 2109 q42_days_drank_30d q43_days_binge_drink_30d 0.539460
# Define categorical variables to analyze (excluding the target variable itself)
columns = [col for col in df.columns if df[col].nunique() <= 15] # Filter categorical vars
n_cols = 2
n_rows = -(-len(columns) // n_cols)
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(15, n_rows * 5))
axes = axes.flatten()
for i, col in enumerate(columns):
df[col].value_counts(normalize=True).plot(kind='bar', colormap='viridis', ax=axes[i])
# Formatting
axes[i].set_title(f'Distribution of {col}', fontsize=14)
axes[i].set_xlabel(col, fontsize=12)
axes[i].set_ylabel('Proportion', fontsize=12)
axes[i].tick_params(axis='x', rotation=45)
# Remove empty subplots if there are any
for j in range(i + 1, len(axes)):
fig.delaxes(axes[j])
plt.tight_layout()
plt.show()
group_var = "suicide_attempt"
# Define categorical variables to analyze (excluding the target variable itself)
columns = [col for col in df.columns if col != group_var and df[col].nunique() <= 15] # Filter categorical vars
n_cols = 2
n_rows = -(-len(columns) // n_cols)
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(15, n_rows * 5))
axes = axes.flatten()
for i, col in enumerate(columns):
crosstab = pd.crosstab(df[col], df[group_var], normalize='index') * 100
# Plot as stacked bar chart
crosstab.plot(kind='bar', stacked=True, colormap='viridis', ax=axes[i])
# Formatting
axes[i].set_title(f'{col} vs {group_var}', fontsize=14)
axes[i].set_xlabel(col, fontsize=12)
axes[i].set_ylabel('Percentage', fontsize=12)
axes[i].legend(title=group_var, loc='upper right')
axes[i].tick_params(axis='x', rotation=45)
# Remove empty subplots if there are any
for j in range(i + 1, len(axes)):
fig.delaxes(axes[j])
plt.tight_layout()
plt.show()
# List of numerical columns to plot
numerical_columns = ['BMIPCT', 'q6_height', 'q7_weight']
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 10)) # 1 row, 3 columns
for i, col in enumerate(numerical_columns):
sns.boxplot(x='suicide_attempt', y=col, data=df, ax=axes[i], palette="viridis")
# Formatting
axes[i].set_title(f'Distribution of {col} by Suicide Attempts', fontsize=14)
axes[i].set_xlabel('Suicide Attempts', fontsize=12)
axes[i].set_ylabel(col, fontsize=12)
plt.tight_layout()
plt.show()
C:\Users\Ivy\AppData\Local\Temp\ipykernel_22724\3025135869.py:7: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='suicide_attempt', y=col, data=df, ax=axes[i], palette="viridis") C:\Users\Ivy\AppData\Local\Temp\ipykernel_22724\3025135869.py:7: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='suicide_attempt', y=col, data=df, ax=axes[i], palette="viridis") C:\Users\Ivy\AppData\Local\Temp\ipykernel_22724\3025135869.py:7: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.boxplot(x='suicide_attempt', y=col, data=df, ax=axes[i], palette="viridis")
# List of categorical columns to plot
columns = ['suicide_attempt', 'raceeth']
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(5, 10))
axes = axes.flatten()
for i, col in enumerate(columns):
value_counts = df[col].value_counts(normalize=True) * 100
sns.barplot(x=value_counts.index, y=value_counts.values, ax=axes[i], palette="viridis")
# Formatting
axes[i].set_title(f'Distribution of {col} (%)', fontsize=14)
axes[i].set_xlabel(col, fontsize=12)
axes[i].set_ylabel('Percentage', fontsize=12)
axes[i].tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()
C:\Users\Ivy\AppData\Local\Temp\ipykernel_22724\770245347.py:9: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=value_counts.index, y=value_counts.values, ax=axes[i], palette="viridis") C:\Users\Ivy\AppData\Local\Temp\ipykernel_22724\770245347.py:9: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=value_counts.index, y=value_counts.values, ax=axes[i], palette="viridis")
column = 'raceeth'
# Define the legend mapping
raceeth_legend = {
1: "Indian/Alaskan Native",
2: "Asian",
3: "Black or African American",
4: "Native Hawaiian/Other PI",
5: "White",
6: "Hispanic/Latino",
7: "Multiple - Hispanic",
8: "Multiple - Non-Hispanic"
}
# Calculate value counts and normalize to percentages
value_counts = df[column].value_counts(normalize=True) * 100
# Create the bar plot
plt.figure(figsize=(10, 6))
sns.barplot(x=value_counts.index, y=value_counts.values, palette="viridis")
# Formatting
plt.title(f'Distribution of {column} (%)', fontsize=14)
plt.xlabel("Race/Ethnicity", fontsize=12)
plt.ylabel('Percentage', fontsize=12)
# plt.xticks(ticks=value_counts.index, labels=[raceeth_legend[val] for val in value_counts.index], rotation=45)
# Add legend
legend_labels = [f"{key} = {value}" for key, value in raceeth_legend.items() if key in value_counts.index]
plt.legend(legend_labels, title="Race/Ethnicity", loc='upper right', fontsize=10)
# Show plot
plt.tight_layout()
plt.show()
C:\Users\Ivy\AppData\Local\Temp\ipykernel_22724\1541197035.py:19: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=value_counts.index, y=value_counts.values, palette="viridis")
print(df.shape)
(20103, 71)
EDA for PRESENTATION
df['suicide_attempt'].value_counts()
suicide_attempt 0 17341 1 2762 Name: count, dtype: int64
df['raceeth'].value_counts(normalize=True)*100
raceeth 5.0 49.156236 7.0 14.118482 8.0 9.192723 3.0 9.076167 1.0 6.760249 6.0 6.121725 2.0 5.042315 4.0 0.532104 Name: proportion, dtype: float64
df['raceeth'].value_counts(normalize=True)*100
raceeth 5.0 49.156236 7.0 14.118482 8.0 9.192723 3.0 9.076167 1.0 6.760249 6.0 6.121725 2.0 5.042315 4.0 0.532104 Name: proportion, dtype: float64
ONE LAST THING
Depending on how you want to impute the missing data, either dropping the not important columns or impute with decision trees. I will leave that to you.
# unique value counts
cols_to_drop = df.nunique().sort_values()[df.nunique().sort_values() == 1]
cols_to_drop
Series([], dtype: int64)
df = df.drop(columns=cols_to_drop.index, errors='ignore')
df.shape
(20103, 71)
# missing values
missing_counts_df = df.isnull().sum()
missing_percent_df = df.isnull().mean()
missing_df = pd.DataFrame({'missing_counts': missing_counts_df,
'missing_percent': missing_percent_df}).sort_values(
by='missing_percent', ascending=False)
missing_df[missing_df['missing_percent'] > 0]
| missing_counts | missing_percent | |
|---|---|---|
| q13_days_carrying_gun | 8729 | 0.434214 |
| q16_12mo_physical_fight | 8451 | 0.420385 |
| q66_weight_perception | 8308 | 0.413272 |
| q34_smoke_count_daily | 7946 | 0.395264 |
| q78_sports_team_count_12mo | 7139 | 0.355121 |
| ... | ... | ... |
| q25_electronically_bullied | 205 | 0.010197 |
| q24_been_bullied | 201 | 0.009999 |
| q3_grade | 193 | 0.009601 |
| q2_gender | 158 | 0.007860 |
| q1_age | 98 | 0.004875 |
61 rows × 2 columns
df[missing_df.index[:5]].describe()
| q13_days_carrying_gun | q16_12mo_physical_fight | q66_weight_perception | q34_smoke_count_daily | q78_sports_team_count_12mo | |
|---|---|---|---|---|---|
| count | 11374.000000 | 11652.000000 | 11795.000000 | 12157.000000 | 12964.000000 |
| mean | 0.157464 | 0.575781 | 2.155320 | 0.142387 | 0.922632 |
| std | 0.854565 | 1.745885 | 0.904338 | 1.325005 | 1.065088 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 0.000000 | 0.000000 | 2.000000 | 0.000000 | 0.000000 |
| 50% | 0.000000 | 0.000000 | 2.000000 | 0.000000 | 1.000000 |
| 75% | 0.000000 | 0.000000 | 3.000000 | 0.000000 | 2.000000 |
| max | 6.000000 | 12.000000 | 4.000000 | 20.000000 | 3.000000 |
impute missing values using decision tree regressor.
# impute missing values using decision tree regressor
# get the columns with missing values greater than 0
cols = missing_df[missing_df['missing_percent'] > 0].index.tolist()
for col in cols:
print(f"Imputing missing values for column: {col}")
df_hidden = df.copy()
# impute missing values with Mode for columns other than 'col' in test_data
for s in df.columns.difference([col]):
df_hidden.loc[:, s] = df_hidden[s].fillna(df[s].mode()[0])
# Split the data into train and test sets
train_data = df_hidden.dropna()
test_data = df_hidden[df_hidden[col].isnull()]
X_train = train_data.drop(columns=[col])
y_train = train_data[col]
X_test = test_data.drop(columns=[col])
y_test = test_data[col]
# Train a decision tree regressor
tree = DecisionTreeRegressor()
tree.fit(X_train, y_train)
# Predict the missing values
y_pred = tree.predict(X_test)
# Fill in the missing values
df.loc[df[col].isnull(), col] = y_pred
Imputing missing values for column: q13_days_carrying_gun Imputing missing values for column: q16_12mo_physical_fight Imputing missing values for column: q66_weight_perception Imputing missing values for column: q34_smoke_count_daily Imputing missing values for column: q78_sports_team_count_12mo Imputing missing values for column: q77_days_PE_class_1w Imputing missing values for column: q37_vapor_source Imputing missing values for column: q67_weight_plans Imputing missing values for column: q75_days_breakfast_1w Imputing missing values for column: q45_alcohol_source Imputing missing values for column: q44_max_drinks_30d Imputing missing values for column: q86_unstable_housing Imputing missing values for column: q18_seen_vio_neighborhood Imputing missing values for column: q8_seatbelt_riding Imputing missing values for column: q74_soda_count Imputing missing values for column: q80_social_media_freq Imputing missing values for column: q40_quit_tabacco_12mo Imputing missing values for column: q63_sex_contact_gender Imputing missing values for column: q43_days_binge_drink_30d Imputing missing values for column: q79_concussion_count_12mo Imputing missing values for column: q21_dating_unwilling_sexual Imputing missing values for column: q84_mental_health_not_good Imputing missing values for column: q20_forced_sexual Imputing missing values for column: q87_academic_grades Imputing missing values for column: q56_early_sex Imputing missing values for column: q83_last_dentist Imputing missing values for column: q46_times_marijuana Imputing missing values for column: q32_first_smoke_age Imputing missing values for column: q19_forced_sex Imputing missing values for column: q85_workdays_average_sleep Imputing missing values for column: q31_ever_smoked Imputing missing values for column: q17_12mo_physicalf_school Imputing missing values for column: q64_sexual_identity Imputing missing values for column: q58_sexual_partners_count Imputing missing values for column: BMIPCT Imputing missing values for column: q7_weight Imputing missing values for column: q6_height Imputing missing values for column: q15_been_threatened_at_school Imputing missing values for column: q65_transgender Imputing missing values for column: q23_discriminated Imputing missing values for column: q59_sex_count_3mo Imputing missing values for column: q57_age_first_sex Imputing missing values for column: q76_days_physically_active_1w Imputing missing values for column: q38_days_chewed_30d Imputing missing values for column: q39_days_smoked_30d Imputing missing values for column: q14_skipped_school_bc_insecure Imputing missing values for column: q36_days_vaped_30d Imputing missing values for column: q42_days_drank_30d Imputing missing values for column: q47_first_marijuana_age Imputing missing values for column: q41_first_drink_age Imputing missing values for column: q22_dating_physical_hurting Imputing missing values for column: q12_days_carrying_weapon_atschool Imputing missing values for column: q9_ride_w_drunk_drinver Imputing missing values for column: raceeth Imputing missing values for column: q35_ever_vaped Imputing missing values for column: q26_depression Imputing missing values for column: q25_electronically_bullied Imputing missing values for column: q24_been_bullied Imputing missing values for column: q3_grade Imputing missing values for column: q2_gender Imputing missing values for column: q1_age
df[cols[:5]].describe()
| q13_days_carrying_gun | q16_12mo_physical_fight | q66_weight_perception | q34_smoke_count_daily | q78_sports_team_count_12mo | |
|---|---|---|---|---|---|
| count | 20103.000000 | 20103.000000 | 20103.000000 | 20103.000000 | 20103.000000 |
| mean | 0.141397 | 0.565836 | 2.238770 | 0.200020 | 0.965925 |
| std | 0.802500 | 1.784646 | 0.877138 | 1.688298 | 1.086537 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 0.000000 | 0.000000 | 2.000000 | 0.000000 | 0.000000 |
| 50% | 0.000000 | 0.000000 | 2.000000 | 0.000000 | 1.000000 |
| 75% | 0.000000 | 0.000000 | 3.000000 | 0.000000 | 2.000000 |
| max | 6.000000 | 12.000000 | 4.000000 | 20.000000 | 3.000000 |
df.to_csv("df_imputed_XXHq_20223.csv", index=False)
df.columns[:50]
Index(['raceeth', 'q1_age', 'q2_gender', 'q3_grade', 'q6_height', 'q7_weight',
'q8_seatbelt_riding', 'q9_ride_w_drunk_drinver',
'q12_days_carrying_weapon_atschool', 'q13_days_carrying_gun',
'q14_skipped_school_bc_insecure', 'q15_been_threatened_at_school',
'q16_12mo_physical_fight', 'q17_12mo_physicalf_school',
'q18_seen_vio_neighborhood', 'q19_forced_sex', 'q20_forced_sexual',
'q21_dating_unwilling_sexual', 'q22_dating_physical_hurting',
'q23_discriminated', 'q24_been_bullied', 'q25_electronically_bullied',
'q26_depression', 'q31_ever_smoked', 'q32_first_smoke_age',
'q34_smoke_count_daily', 'q35_ever_vaped', 'q36_days_vaped_30d',
'q37_vapor_source', 'q38_days_chewed_30d', 'q39_days_smoked_30d',
'q40_quit_tabacco_12mo', 'q41_first_drink_age', 'q42_days_drank_30d',
'q43_days_binge_drink_30d', 'q44_max_drinks_30d', 'q45_alcohol_source',
'q46_times_marijuana', 'q47_first_marijuana_age', 'q56_early_sex',
'q57_age_first_sex', 'q58_sexual_partners_count', 'q59_sex_count_3mo',
'q63_sex_contact_gender', 'q64_sexual_identity', 'q65_transgender',
'q66_weight_perception', 'q67_weight_plans', 'q74_soda_count',
'q75_days_breakfast_1w'],
dtype='object')
df.columns[50:100]
Index(['q76_days_physically_active_1w', 'q77_days_PE_class_1w',
'q78_sports_team_count_12mo', 'q79_concussion_count_12mo',
'q80_social_media_freq', 'q83_last_dentist',
'q84_mental_health_not_good', 'q85_workdays_average_sleep',
'q86_unstable_housing', 'q87_academic_grades', 'BMIPCT', 'weight',
'q10_11_unsafe_driving', 'q27_30_suicide_combined', 'q33_smoking',
'q48_55_substance_use', 'q57_early_sexual_intercourse',
'q60_62_unsafe_sex', 'q68_73_healthy_diet', 'q81_82_std_checked',
'suicide_attempt'],
dtype='object')